In [1]:
# Computations
import numpy as np
import pandas as pd

# scipy
import scipy.stats as stats

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import KFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

# Visualisation libraries
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec

import missingno as msno

import plotly.offline as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot 
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

# plt setting
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Pima Indians Diabetes Data Classification

In this article, we use Kaggle'sPima Indians Diabetes. The Pima indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for predictions.

Context

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

Content

The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

Citations

Table of contents

Dataset Analysis

In [2]:
Data = pd.read_csv('pima-indians-diabetes-database/diabetes.csv')
display(Data.head())

print('The Dataset Shape: %i rows and %i columns' % Data.shape)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
The Dataset Shape: 768 rows and 9 columns
Feature Explanations
Pregnancies Number of times pregnant
Glucose Plasma glucose concentration a 2 hours in an oral glucose tolerance test
BloodPressure Diastolic blood pressure (mm Hg)
SkinThickness Triceps skinfold thickness (mm)
Insulin 2-Hour serum insulin (mu U/ml)
BMI Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction Diabetes pedigree function
Age Age (years)
Outcome Whether or not a patient has diabetes
In [3]:
def Data_info(Inp, Only_NaN = False):
    Out = pd.DataFrame(Inp.dtypes,columns=['Data Type']).sort_values(by=['Data Type'])
    Out = Out.join(pd.DataFrame(Inp.isnull().sum(), columns=['Number of NaN Values']), how='outer')
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out
display(Data_info(Data).T[:2])
_ = msno.bar(Data, figsize=(12,3), fontsize=14, log=False, color="#34495e")
display(Data.describe())
Age BMI BloodPressure DiabetesPedigreeFunction Glucose Insulin Outcome Pregnancies SkinThickness
Data Type int64 float64 int64 float64 int64 int64 int64 int64 int64
Number of NaN Values 0 0 0 0 0 0 0 0 0
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

Let's take a close look at our data.

In [4]:
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
for i in range(len(Data.columns[:-1])):
    sns.distplot(Data.iloc[:,i], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax= ax[int(i/2),i%2])
    if Data.iloc[:,i].name != 'BMI':
        ax[int(i/2),i%2].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", Data.iloc[:,i].name))
In [5]:
Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Data['Outcome']]

fig = go.Figure(data=go.Splom(dimensions=[dict(label='Pregnancies', values=Data['Pregnancies']),
                              dict(label='Glucose', values=Data['Glucose']),
                              dict(label='Blood<br>Pressure', values=Data['BloodPressure']),
                              dict(label='Skin<br>Thickness', values=Data['SkinThickness']),
                              dict(label='Insulin', values=Data['Insulin']),
                              dict(label='BMI', values=Data['BMI']),
                              dict(label='Diabetes<br>Pedigree<br>Fun', values=Data['DiabetesPedigreeFunction']),
                              dict(label='Age', values=Data['Age'])],
                              showupperhalf=False,
                              marker=dict(color=Data['Outcome'], size=4, colorscale='Bluered',
                              line=dict(width=0.4, color='black')),
                              text=Temp, diagonal=dict(visible=False)))
del Temp
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()

As can be seen, the Data has a normal distribution, and some entries need to be adjusted. In doing so, we defined a normalizer as follows, for a given vector $x$,

\begin{align*} \text{Normalizer}(x, cut) = \begin{cases} x_i &\mbox{if } |x_i- \mu|<\sigma\times cut \\ mode(x) & \mbox{else} \end{cases}. \end{align*}
In [6]:
def Normalizer(Col, cut = 3):
    return Col[(Col > (Col.mean() - Col.std() * cut)) &
               (Col < (Col.mean() + Col.std() * cut))]

fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
Temp = Data.copy()
for i in range(len(Data.columns[:-1])):
    Data[Data.columns[i]] = Normalizer(Data[Data.columns[i]])
    Data[Data.columns[i]] = Data[Data.columns[i]].fillna(Data[Data.columns[i]].dropna().mode()[0])
    # Sub-Plots
    sns.distplot(Data.iloc[:,i], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax= ax[int(i/2),i%2])
    if Data.iloc[:,i].name != 'BMI':
        ax[int(i/2),i%2].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", Data.iloc[:,i].name))

Basically, we diminished the influence of certain data points (see the following figure).

In [7]:
Temp0 = Temp.copy()
Temp0.iloc[:,:-1] = abs(Data.iloc[:,:-1] - Temp.iloc[:,:-1])

Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Temp0['Outcome']]

fig = go.Figure(data=go.Splom(dimensions=[dict(label='Pregnancies', values=Temp0['Pregnancies']),
                              dict(label='Glucose', values=Temp0['Glucose']),
                              dict(label='Blood<br>Pressure', values=Temp0['BloodPressure']),
                              dict(label='Skin<br>Thickness', values=Temp0['SkinThickness']),
                              dict(label='Insulin', values=Temp0['Insulin']),
                              dict(label='BMI', values=Temp0['BMI']),
                              dict(label='Diabetes<br>Pedigree<br>Fun', values=Temp0['DiabetesPedigreeFunction']),
                              dict(label='Age', values=Temp0['Age'])],
                              showupperhalf=False,
                              marker=dict(color=Temp0['Outcome'], size=4, colorscale='Bluered',
                              line=dict(width=0.4, color='black')),
                              text=Temp, diagonal=dict(visible=False)))
del Temp, Temp0
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()

Data Correlation

In [8]:
def Correlation_Plot (Df,Fig_Size):
    Correlation_Matrix = Df.corr()
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask)] = True
    for i in range(len(mask)):
        mask[i,i]=0
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True, 
                cmap =sns.color_palette("RdYlGn", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .7})
    bottom, top = ax.get_ylim()

Correlation_Plot (Data, 9)
In [9]:
Temp = Data.iloc[:,:-1].var().sort_values(ascending = False).to_frame(name= 'Variance')
display(Temp)
Temp0 = Data.corr()
Temp0.loc[Temp.index[-1]].sort_values().to_frame(name= 'Correlation')[:-1].T
Variance
Insulin 7844.510917
Glucose 929.680350
SkinThickness 246.979708
BloodPressure 146.573540
Age 128.991301
BMI 43.941176
Pregnancies 10.734190
DiabetesPedigreeFunction 0.078702
Out[9]:
Pregnancies BloodPressure Age Glucose BMI SkinThickness Insulin Outcome
Correlation 0.015703 0.034428 0.066525 0.095686 0.122868 0.15229 0.184028 0.192156

Even though the variance of Diabetes Pedigree Function is low, this might not improve the performance of the model, the correlation of this feature with the reset of features, especially with the Outcome, is noticeable.

Modeling and Classification

In [10]:
Target = 'Outcome'

X = Data.drop(columns = [Target])
y = Data[Target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Out[10]:
Set X_train X_test y_train y_test
Shape (537, 8) (231, 8) (537,) (231,)

KNeighborsClassifier

The first classifier that we use here is KNeighborsClassifier with GridSearchCV.

In [11]:
grid_knn = RandomizedSearchCV(KNeighborsClassifier(),
                              {'n_neighbors': list(np.arange(1, 101, 10))},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),  
                              n_iter = 30,
                              scoring = 'roc_auc', 
                              error_score = 0, 
                              verbose = 3, 
                              n_jobs = 10,
                              refit = True)
grid_knn.fit(X_train,y_train)

display(pd.DataFrame({'Best Score': [grid_knn.best_score_],
                      'Best Paramerers': [str(grid_knn.best_params_)],
                      'Accuracy': [grid_knn.score(X_test,y_test)]}).round(4).style.hide_index())

Temp = [str(x) for x in grid_knn.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid_knn.cv_results_['rank_test_score'],
                      'params':Temp,
                      'mean_test_score': grid_knn.cv_results_['mean_test_score'],
                      'mean_fit_time': grid_knn.cv_results_['mean_fit_time']})
display(Table.round(4).sort_values('rank_test_score').set_index('rank_test_score').head(10))

# Plot the results of the grid search.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
_ = axes[0].errorbar(x = Table['params'],
                     y = Table['mean_test_score'],
                     yerr = Table['mean_test_score'])
_ = axes[0].set(xlabel = 'Paramerers', title='Classification accuracy')
_ = axes[0].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[0].set_ylim(bottom = 0)

_ = axes[1].errorbar(x = Table['params'],
                     y = Table['mean_fit_time'],
                     yerr = Table['mean_fit_time'], color='r')
_ = axes[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = axes[1].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[1].set_ylim(bottom = 0)
fig.tight_layout()

# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_knn.predict_proba(X_test)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')
Fitting 8 folds for each of 10 candidates, totalling 80 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    1.0s
[Parallel(n_jobs=10)]: Done  80 out of  80 | elapsed:    1.1s finished
Best Score Best Paramerers Accuracy
0.817700 {'n_neighbors': 31} 0.757000
params mean_test_score mean_fit_time
rank_test_score
1 n_neighbors: 31 0.8177 0.0015
2 n_neighbors: 21 0.8143 0.0014
3 n_neighbors: 41 0.8111 0.0015
4 n_neighbors: 51 0.8093 0.0016
5 n_neighbors: 11 0.8076 0.0016
6 n_neighbors: 61 0.8063 0.0016
7 n_neighbors: 71 0.8052 0.0018
8 n_neighbors: 81 0.8007 0.0015
9 n_neighbors: 91 0.7999 0.0015
10 n_neighbors: 1 0.6565 0.0021

Let's compute receiver operating characteristic (ROC) and plot them.

In [12]:
# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_knn.predict_proba(X_test)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')

Furthermore, we would like to demonstrate the importance of standardizing features by removing the mean and scaling to unit variance.

In [13]:
scaler = StandardScaler()

X_train_STD = scaler.fit_transform(X_train)
X_test_STD = scaler.transform(X_test)
In [14]:
grid_knn = RandomizedSearchCV(KNeighborsClassifier(),
                              {'n_neighbors': list(np.arange(1, 101, 10))},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),  
                              n_iter = 30,
                              scoring = 'roc_auc', 
                              error_score = 0, 
                              verbose = 3, 
                              n_jobs = 10,
                              refit = True)
grid_knn.fit(X_train_STD,y_train)

display(pd.DataFrame({'Best Score': [grid_knn.best_score_],
                      'Best Paramerers': [str(grid_knn.best_params_)],
                      'Accuracy': [grid_knn.score(X_test_STD,y_test)]}).round(4).style.hide_index())

Temp = [str(x) for x in grid_knn.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid_knn.cv_results_['rank_test_score'],
                      'params':Temp,
                      'mean_test_score': grid_knn.cv_results_['mean_test_score'],
                      'mean_fit_time': grid_knn.cv_results_['mean_fit_time']})
display(Table.round(4).sort_values('rank_test_score').set_index('rank_test_score').head(10))

# Plot the results of the grid search.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
_ = axes[0].errorbar(x = Table['params'],
                     y = Table['mean_test_score'],
                     yerr = Table['mean_test_score'])
_ = axes[0].set(xlabel = 'Paramerers', title='Classification accuracy')
_ = axes[0].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[0].set_ylim(bottom = 0)

_ = axes[1].errorbar(x = Table['params'],
                     y = Table['mean_fit_time'],
                     yerr = Table['mean_fit_time'], color='r')
_ = axes[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = axes[1].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[1].set_ylim(bottom = 0)
fig.tight_layout()

# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_knn.predict_proba(X_test_STD)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')
Fitting 8 folds for each of 10 candidates, totalling 80 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done  80 out of  80 | elapsed:    0.0s finished
Best Score Best Paramerers Accuracy
0.832200 {'n_neighbors': 31} 0.782200
params mean_test_score mean_fit_time
rank_test_score
1 n_neighbors: 31 0.8322 0.0009
2 n_neighbors: 81 0.8306 0.0007
3 n_neighbors: 71 0.8268 0.0010
4 n_neighbors: 91 0.8260 0.0009
5 n_neighbors: 21 0.8252 0.0010
6 n_neighbors: 41 0.8249 0.0009
7 n_neighbors: 61 0.8243 0.0010
8 n_neighbors: 51 0.8231 0.0007
9 n_neighbors: 11 0.8075 0.0010
10 n_neighbors: 1 0.6794 0.0010

Decision Tree Classifier

For more details regarding, please see this link

In [15]:
grid_dtc = RandomizedSearchCV(DecisionTreeClassifier(),
                              {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),  
                              n_iter = 30,
                              scoring = 'roc_auc', 
                              error_score = 0, 
                              verbose = 3, 
                              n_jobs = 10,
                              refit = True)
grid_dtc.fit(X_train_STD,y_train)

display(pd.DataFrame({'Best Score': [grid_dtc.best_score_],
                      'Best Paramerers': [str(grid_dtc.best_params_)],
                      'Accuracy': [grid_dtc.score(X_test_STD,y_test)]}).round(4).style.hide_index())

Temp = [str(x) for x in grid_dtc.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid_dtc.cv_results_['rank_test_score'],
                      'params':Temp,
                      'mean_test_score': grid_dtc.cv_results_['mean_test_score'],
                      'mean_fit_time': grid_dtc.cv_results_['mean_fit_time']})
display(Table.round(4).sort_values('rank_test_score').set_index('rank_test_score').head(10))

# Plot the results of the grid search.
fig, axes = plt.subplots(1, 2, figsize=(10, 7))
_ = axes[0].errorbar(x = Table['params'],
                     y = Table['mean_test_score'],
                     yerr = Table['mean_test_score'])
_ = axes[0].set(xlabel = 'Paramerers', title='Classification accuracy')
_ = axes[0].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[0].set_ylim(bottom = 0)

_ = axes[1].errorbar(x = Table['params'],
                     y = Table['mean_fit_time'],
                     yerr = Table['mean_fit_time'], color='r')
_ = axes[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = axes[1].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[1].set_ylim(bottom = 0)
fig.tight_layout()

# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_dtc.predict_proba(X_test_STD)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')
Fitting 8 folds for each of 24 candidates, totalling 192 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 192 out of 192 | elapsed:    0.1s finished
Best Score Best Paramerers Accuracy
0.813500 {'max_depth': 4, 'criterion': 'gini'} 0.772600
params mean_test_score mean_fit_time
rank_test_score
1 max_depth: 4, criterion: gini 0.8135 0.0012
2 max_depth: 3, criterion: gini 0.8030 0.0010
3 max_depth: 4, criterion: entropy 0.7988 0.0014
4 max_depth: 3, criterion: entropy 0.7951 0.0011
5 max_depth: 5, criterion: entropy 0.7856 0.0018
6 max_depth: 5, criterion: gini 0.7764 0.0012
7 max_depth: 6, criterion: gini 0.7685 0.0015
8 max_depth: 6, criterion: entropy 0.7684 0.0018
9 max_depth: 2, criterion: gini 0.7607 0.0011
10 max_depth: 7, criterion: gini 0.7547 0.0016

C-Support Vector Classification

For more details regarding, please see this link

In [16]:
grid_svc = RandomizedSearchCV(SVC(probability=True),
                              {'kernel':['rbf','linear'], 'C':[10.0**n for n in np.arange(-2,3)],
                               'gamma':[10.0**n for n in np.arange(-4,-1)]},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),  
                              n_iter = 30,
                              scoring = 'roc_auc', 
                              error_score = 0, 
                              verbose = 3, 
                              n_jobs = 10,
                              refit = True)
grid_svc.fit(X_train_STD,y_train)

display(pd.DataFrame({'Best Score': [grid_svc.best_score_],
                      'Best Paramerers': [str(grid_svc.best_params_)],
                      'Accuracy': [grid_svc.score(X_test_STD,y_test)]}).round(4).style.hide_index())

Temp = [str(x) for x in grid_svc.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid_svc.cv_results_['rank_test_score'],
                      'params':Temp,
                      'mean_test_score': grid_svc.cv_results_['mean_test_score'],
                      'mean_fit_time': grid_svc.cv_results_['mean_fit_time']})
display(Table.round(4).sort_values('rank_test_score').set_index('rank_test_score').head(10))

# Plot the results of the grid search.
fig, axes = plt.subplots(1, 2, figsize=(10, 7))
_ = axes[0].errorbar(x = Table['params'],
                     y = Table['mean_test_score'],
                     yerr = Table['mean_test_score'])
_ = axes[0].set(xlabel = 'Paramerers', title='Classification accuracy')
_ = axes[0].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[0].set_ylim(bottom = 0)

_ = axes[1].errorbar(x = Table['params'],
                     y = Table['mean_fit_time'],
                     yerr = Table['mean_fit_time'], color='r')
_ = axes[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = axes[1].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[1].set_ylim(bottom = 0)
fig.tight_layout()

# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_svc.predict_proba(X_test_STD)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    2.3s finished
Best Score Best Paramerers Accuracy
0.851700 {'kernel': 'linear', 'gamma': 0.0001, 'C': 0.1} 0.798400
params mean_test_score mean_fit_time
rank_test_score
1 kernel: linear, gamma: 0.01, C: 0.1 0.8517 0.0091
1 kernel: linear, gamma: 0.0001, C: 0.1 0.8517 0.0091
1 kernel: linear, gamma: 0.001, C: 0.1 0.8517 0.0089
4 kernel: rbf, gamma: 0.01, C: 1.0 0.8514 0.0143
5 kernel: rbf, gamma: 0.01, C: 0.1 0.8506 0.0146
6 kernel: rbf, gamma: 0.001, C: 0.01 0.8502 0.0156
7 kernel: rbf, gamma: 0.0001, C: 0.1 0.8502 0.0150
8 kernel: rbf, gamma: 0.001, C: 100.0 0.8500 0.0140
9 kernel: rbf, gamma: 0.01, C: 0.01 0.8499 0.0153
10 kernel: rbf, gamma: 0.001, C: 0.1 0.8499 0.0158

Random Forest Classifier

For more details regarding, please see this link

In [17]:
grid_rfc = RandomizedSearchCV(RandomForestClassifier(),
                              {'n_estimators': [n*100 for n in [2**m for m in np.arange(0,4)]],
                               'max_depth': list(np.arange(2,5)),
                               'min_samples_leaf': [round(x,1) for x in np.linspace(0,1,6)],
                               'max_features':['auto','sqrt'],
                               'criterion':['gini','entropy']},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),  
                              n_iter = 30,
                              scoring = 'roc_auc', 
                              error_score = 0, 
                              verbose = 3, 
                              n_jobs = 10,
                              refit = True)
grid_rfc.fit(X_train_STD,y_train)

display(pd.DataFrame({'Best Score': [grid_rfc.best_score_],
                      'Best Paramerers': [str(grid_rfc.best_params_)],
                      'Accuracy': [grid_rfc.score(X_test_STD,y_test)]}).round(4).style.hide_index())

Temp = [str(x) for x in grid_rfc.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid_rfc.cv_results_['rank_test_score'],
                      'params':Temp,
                      'mean_test_score': grid_rfc.cv_results_['mean_test_score'],
                      'mean_fit_time': grid_rfc.cv_results_['mean_fit_time']})
display(Table.round(4).sort_values('rank_test_score').set_index('rank_test_score').head(10))

# Plot the results of the grid search.
fig, axes = plt.subplots(1, 2, figsize=(10, 10))
_ = axes[0].errorbar(x = Table['params'],
                     y = Table['mean_test_score'],
                     yerr = Table['mean_test_score'])
_ = axes[0].set(xlabel = 'Paramerers', title='Classification accuracy')
_ = axes[0].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[0].set_ylim(bottom = 0)

_ = axes[1].errorbar(x = Table['params'],
                     y = Table['mean_fit_time'],
                     yerr = Table['mean_fit_time'], color='r')
_ = axes[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = axes[1].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[1].set_ylim(bottom = 0)
fig.tight_layout()

# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_rfc.predict_proba(X_test_STD)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 196 tasks      | elapsed:    3.8s
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    6.5s finished
Best Score Best Paramerers Accuracy
0.830700 {'n_estimators': 800, 'min_samples_leaf': 0.2, 'max_features': 'auto', 'max_depth': 4, 'criterion': 'entropy'} 0.781900
params mean_test_score mean_fit_time
rank_test_score
1 n_estimators: 800, min_samples_leaf: 0.2, max_... 0.8307 0.7434
2 n_estimators: 800, min_samples_leaf: 0.2, max_... 0.8302 0.7167
3 n_estimators: 400, min_samples_leaf: 0.2, max_... 0.8300 0.3749
4 n_estimators: 200, min_samples_leaf: 0.2, max_... 0.8295 0.1871
5 n_estimators: 400, min_samples_leaf: 0.4, max_... 0.5000 0.3591
5 n_estimators: 800, min_samples_leaf: 0.4, max_... 0.5000 0.7067
5 n_estimators: 800, min_samples_leaf: 0.4, max_... 0.5000 0.7070
5 n_estimators: 800, min_samples_leaf: 0.4, max_... 0.5000 0.7245
9 n_estimators: 200, min_samples_leaf: 0.6, max_... 0.0000 0.0565
9 n_estimators: 200, min_samples_leaf: 1.0, max_... 0.0000 0.0530

XGBoost Classifier

For more details regarding XGBoost classification, please see this link

In [18]:
grid_xgb = RandomizedSearchCV(xgb.XGBClassifier(objective = 'binary:logistic'), 
                              param_distributions = {'booster':['gbtree','gblinear'],
                              'colsample_bytree': np.round(np.arange(0.4,1.2,0.2),1).tolist(),
                              'learning_rate': [0.001, 0.01,0.1,0.2,0.4],
                              'max_depth': np.arange(2,8).tolist(),
                              'n_estimators': [100*n for n in [2**m for m in np.arange(1,4)]],
                              'subsample': np.round(np.arange(0.4,1.2,0.2),1).tolist()},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),  
                              n_iter = 30,
                              scoring = 'roc_auc', 
                              error_score = 0, 
                              verbose = 3, 
                              n_jobs = 10,
                              refit = True)
grid_xgb.fit(X_train_STD,y_train)

display(pd.DataFrame({'Best Score': [grid_xgb.best_score_],
                      'Best Paramerers': [str(grid_xgb.best_params_)],
                      'Accuracy': [grid_xgb.score(X_test_STD,y_test)]}).round(4).style.hide_index())

Temp = [str(x) for x in grid_xgb.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid_xgb.cv_results_['rank_test_score'],
                      'params':Temp,
                      'mean_test_score': grid_xgb.cv_results_['mean_test_score'],
                      'mean_fit_time': grid_xgb.cv_results_['mean_fit_time']})
display(Table.round(4).sort_values('rank_test_score').set_index('rank_test_score').head(10))

# Plot the results of the grid search.
fig, axes = plt.subplots(1, 2, figsize=(10, 11))
_ = axes[0].errorbar(x = Table['params'],
                     y = Table['mean_test_score'],
                     yerr = Table['mean_test_score'])
_ = axes[0].set(xlabel = 'Paramerers', title='Classification accuracy')
_ = axes[0].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[0].set_ylim(bottom = 0)

_ = axes[1].errorbar(x = Table['params'],
                     y = Table['mean_fit_time'],
                     yerr = Table['mean_fit_time'], color='r')
_ = axes[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = axes[1].set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = axes[1].set_ylim(bottom = 0)
fig.tight_layout()

# false positive rates, true positive rates and thresholds
fpr, tpr, threshold = metrics.roc_curve(y_test, grid_xgb.predict_proba(X_test_STD)[:,1])

fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
_ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
_ = ax.plot([0, 1], [0, 1],'r--', lw=2)
_ = ax.legend(loc = 'lower right', fontsize = 14)
_ = ax.set_xlim([0,1])
# _ = ax.set_ylim([0,1])
_ = ax.set_xlabel('False Positive Rate (FPR)')
_ = ax.set_ylabel('True Positive Rate (TPR)')
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.8s
[Parallel(n_jobs=10)]: Done 136 tasks      | elapsed:    2.5s
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    3.8s finished
Best Score Best Paramerers Accuracy
0.854800 {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 1.0, 'booster': 'gblinear'} 0.797800
params mean_test_score mean_fit_time
rank_test_score
1 subsample: 0.8, n_estimators: 800, max_depth: ... 0.8548 0.1236
1 subsample: 0.6, n_estimators: 400, max_depth: ... 0.8548 0.0596
1 subsample: 0.4, n_estimators: 800, max_depth: ... 0.8548 0.1227
1 subsample: 0.6, n_estimators: 200, max_depth: ... 0.8548 0.0349
1 subsample: 0.6, n_estimators: 400, max_depth: ... 0.8548 0.0582
1 subsample: 1.0, n_estimators: 800, max_depth: ... 0.8548 0.1195
1 subsample: 0.4, n_estimators: 800, max_depth: ... 0.8548 0.1216
1 subsample: 0.6, n_estimators: 400, max_depth: ... 0.8548 0.0624
1 subsample: 0.8, n_estimators: 400, max_depth: ... 0.8548 0.0610
1 subsample: 1.0, n_estimators: 200, max_depth: ... 0.8548 0.0769

Voting Classifier

In this article, we talked about stacking classifiers. However, here, we would like to use a Voting classifier model which combines multiple different models into a single model. For more details please see sklearn.ensemble.VotingClassifier.

We use two approaches for our voting classifiers, soft and hard. Hard uses predicted class labels for majority rule voting; however, soft predicts the class label based on the arguments of the maxima (the argmax) of the sums of the predicted probabilities.

In [19]:
Classifiers = [('KNN',grid_knn),
               ('Decision Tree', grid_dtc),
               ('SVC', grid_svc),
               ('Random Forest', grid_rfc),
               ('XGBoost', grid_xgb)]

# Voting Classifier (hard)
vtc_hard = VotingClassifier(estimators= Classifiers, voting='hard')
_ = vtc_hard.fit(X_train_STD, y_train)

# Voting Classifier (soft)
vtc_soft = VotingClassifier(estimators= Classifiers, voting='soft')
_ = vtc_soft.fit(X_train_STD, y_train)

print('===============================================================================')
print ('Voting Classifier (Hard Voting) Score %.2f' % vtc_hard.score(X_test_STD,y_test))
print ('Voting Classifier (Soft Voting) Score %.2f' % vtc_soft.score(X_test_STD,y_test))
print('===============================================================================')
Fitting 8 folds for each of 10 candidates, totalling 80 fits
Fitting 8 folds for each of 24 candidates, totalling 192 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done  80 out of  80 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 192 out of 192 | elapsed:    0.1s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    2.3s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.1s
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    4.4s finished
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    2.4s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done  80 out of  80 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
Fitting 8 folds for each of 10 candidates, totalling 80 fits
Fitting 8 folds for each of 24 candidates, totalling 192 fits
[Parallel(n_jobs=10)]: Done 192 out of 192 | elapsed:    0.1s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    2.4s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Done 221 out of 240 | elapsed:    4.1s remaining:    0.3s
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    5.2s finished
Fitting 8 folds for each of 30 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 221 out of 240 | elapsed:    2.7s remaining:    0.1s
===============================================================================
Voting Classifier (Hard Voting) Score 0.76
Voting Classifier (Soft Voting) Score 0.76
===============================================================================
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:    3.5s finished